#! /bin/bash

# Unicode emoji data files that all tables below are generated from.
unidata="emoji-data.txt emoji-variation-sequences.txt emoji-sequences.txt emoji-zwj-sequences.txt"

# Ask make to build the data files as targets. Its output is sent to stderr
# so that stdout carries nothing but the generated table.
# If make fails, stop with its exit status instead of going on to generate
# a table from missing or stale input.
# shellcheck disable=SC2086 # word splitting intended: one make target per file
make $unidata >&2 || exit

# Table generators, selected by the first command line argument:
#   base   one entry per code point, with its emoji property flags
#   seqs0  one entry per emoji sequence, code points only
#   seqs   one entry per emoji sequence, code points and quoted name
# Each generator reads the emoji data files in the current directory and
# writes brace-enclosed table entries to stdout. Any other argument
# produces no output.

# Full outer join of two files on their first field (unpairable lines of
# both inputs are kept). join needs its inputs ordered by the collation it
# runs under; LC_ALL=C pins that to byte order instead of the caller's locale.
join_outer() {
  LC_ALL=C join -a 1 -a 2 "$@"
}

# Mode "base": writes one "  {0, 0 | EM_<tag>..., 0x<code>}," entry per code
# point. Creates the scratch files .emoji.<tag> and .emoji_<tag> in the
# current directory and needs the external "uniset" tool.
emit_base() {
  local tag

  # Select the property lines and replace the first ";" with "#" to
  # enable proper range recognition by uniset.
  grep "; *Emoji_Presentation *#" emoji-data.txt |
    sed -e "s/;/#/" > .emoji.pres
  grep "; *Extended_Pictographic *#" emoji-data.txt |
    sed -e "s/;/#/" > .emoji.pict

  # Keep only the variation sequences that end in FE0E (tag "text") or
  # FE0F (tag "emoj") and remove that selector: "t" ends the cycle for
  # lines where the substitution applied, all other lines reach "d".
  sed -e "s/ FE0E *;/;/" -e t -e d emoji-variation-sequences.txt |
    sed -e "s/;/#/" > .emoji.text
  sed -e "s/ FE0F *;/;/" -e t -e d emoji-variation-sequences.txt |
    sed -e "s/;/#/" > .emoji.emoj

  # Leading code point (or range) of every sequence: each line is cut at
  # its first blank or ";", so no ";" remains that would need replacing.
  sed -e "s/[ ;].*//" emoji-sequences.txt emoji-zwj-sequences.txt > .emoji.base

  echo tagging emojis >&2
  for tag in pres pict text emoj base; do
    # Per line of uniset output: prefix "0" if the first field is four
    # characters wide, strip the "#" comment, and append the tag name.
    uniset +.emoji.$tag table |
      sed -e "/^.... / s/^/0/" -e "s/#.*//" -e "s/ *$/ $tag/" > .emoji_$tag
  done

  echo joining emojis >&2
  # Merge the five tag lists into lines "<code> <tag>..." and format them:
  # every blank becomes " | EM_", then the code is moved behind the flags.
  join_outer .emoji_base .emoji_text |
    join_outer - .emoji_emoj |
    join_outer - .emoji_pres |
    join_outer - .emoji_pict |
    sed -e "s/ / | EM_/g" -e "s/\([^ ]*\)\(.*\)/  {0, 0\2, 0x\1},/"
}

# Mode "seqs0": writes one "  {0, {ee(0x...), ...}}," entry per distinct
# emoji sequence. A four-digit first code point gets a leading "0".
emit_seqs0() {
  # Sorting is done in byte order (LC_ALL=C), where ',' precedes '}'.
  # Thereby shorter sequences are sorted behind longer sequences that
  # start with them, which is an important property for proper matching.
  sed -e 's/;.*//' -e 's/ *$//' -e '/^#/ d' -e '/^$/ d' \
      emoji-sequences.txt emoji-zwj-sequences.txt |
    sed -e '/^.... / s/^/0/' -e 's/ /, 0x/g' -e 's/^/  {0, {0x/' -e 's/$/}},/' |
    sed -e 's/\(0x[0-9A-F]*\)/ee(\1)/g' |
    LC_ALL=C sort -u
}

# Mode "seqs": writes one "  {0, {ee(0x...), ...}, "name"}," entry per
# distinct emoji sequence.
emit_seqs() {
  # A tab temporarily separates the code points from the quoted name.
  local tab=$'\t'

  # Stage 1: drop comment lines, trailing "#" comments and empty lines;
  #          restore "#" from its "\x{23}" notation; replace the middle
  #          field (from the first to the last ";") by a tab and quote
  #          the name.
  # Stage 2: turn the blanks in front of the tab into ", 0x" separators;
  #          remove a "0x" that follows a blank behind the tab.
  # Stage 3: add the braces around the code point list and the entry.
  # Stage 4: wrap each code point in ee().
  # Stage 5: change the name "Pirate Flag" to lower case.
  # Stage 6: pad four-digit code points with one "0" so that the sort
  #          compares them at five digits; the sed after the sort removes
  #          the "0" following each "0x" again.
  # Sorting is done in byte order (LC_ALL=C), where ',' precedes '}'.
  # Thereby shorter sequences are sorted behind longer sequences that
  # start with them, which is an important property for proper matching.
  sed -e '/^#/ d' -e 's/#.*//' -e 's/\\x{23}/#/' -e '/^$/ d' \
      -e 's/ *;.*; */'"$tab"'"/' -e 's/ *$/"/' \
      emoji-sequences.txt emoji-zwj-sequences.txt |
    sed -e ': x' -e 's/ \(.*'"$tab"'\)/,0x\1/' -e 't x' \
        -e 's/,0x/, 0x/g' -e 's/\('"$tab"'.* \)0x/\1/g' |
    sed -e 's/^/  {0, {0x/' -e 's/'"$tab"'/}, /' -e 's/$/},/' |
    sed -e 's/\(0x[0-9A-F]*\)/ee(\1)/g' |
    sed -e 's/"Pirate Flag"/"pirate flag"/' |
    sed -e 's/0x\([0-9A-F][0-9A-F][0-9A-F][0-9A-F][^0-9A-F]\)/0x0\1/g' |
    LC_ALL=C sort -u |
    sed -e 's/0x0/0x/g'
}

case "$1" in

base)
  emit_base
  ;;

seqs0)
  emit_seqs0
  ;;

seqs)
  emit_seqs
  ;;

esac
